from lxml import etree
import pandas as pd
import re
import numpy as np
import nltk
from nltk.stem.snowball import SnowballStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
import plotly
import plotly.graph_objs as go
plotly.offline.init_notebook_mode(connected=False)
Читаем XML-файл с УК РФ и записываем в dataframe articles все статьи кодекса.
# Read the XML file with the Russian Criminal Code and collect every article
# into the `articles` dataframe (one row per article, with its section and
# chapter metadata).
with open('RFCriminalCode.xml') as file:
    tree = etree.parse(file)
root = tree.getroot()
# Accumulate rows in a plain list and build the frame once:
# DataFrame.append() in a loop is O(n^2) and was removed in pandas 2.0.
# Iterating an element directly yields its children; getchildren() is
# deprecated in lxml and removed from the stdlib ElementTree API.
rows = []
for part in root:
    for section in part:
        for chapter in section:
            for article in chapter:
                rows.append({'number': article.get('number'),
                             'section': section.get('number'),
                             'section_name': section.get('name'),
                             'chapter': chapter.get('number'),
                             'chapter_name': chapter.get('name'),
                             'name': article.get('name'),
                             # first child element holds the article text
                             'body': article[0].text})
articles = pd.DataFrame(rows, columns=['number', 'section', 'section_name',
                                       'chapter', 'chapter_name', 'name',
                                       'body'])
# Drop articles with missing bodies/attributes.
articles = articles.dropna()
# Snowball stemmer for Russian text.
stemmer = SnowballStemmer('russian', ignore_stopwords=True)


def tokenize_and_stem(text):
    """Split *text* into word tokens and return their Russian stems.

    Tokens without any Cyrillic letter (numbers, punctuation, Latin
    fragments) are discarded before stemming.
    """
    words = [
        token
        for sentence in nltk.sent_tokenize(text)
        for token in nltk.word_tokenize(sentence)
    ]
    cyrillic_words = [w for w in words if re.search('[а-яА-Я]', w)]
    return [stemmer.stem(w) for w in cyrillic_words]
# Pre-compute the stemmed tokens for every article body.
articles['stemmed_tokens'] = articles.body.map(tokenize_and_stem)
# Peek at the first remaining article. Positional indexing is used because
# dropna() above may have removed the row labelled 0, which would make
# label-based `articles.body[0]` raise KeyError.
articles.body.iloc[0]
articles.head()
Посчитаем tf-idf для наших документов, отбрасывая термины с экстремальными документными частотами (оставляем 0.2 < df < 0.8).
tfidf_vectorizer = TfidfVectorizer(max_df=0.8, #max_features=200,
min_df=0.2, stop_words=nltk.corpus.stopwords.words('russian'),
use_idf=True, tokenizer=tokenize_and_stem, ngram_range=(1,3))
%time tfidf_matrix = tfidf_vectorizer.fit_transform(articles.body)
print(tfidf_matrix.shape)
terms = tfidf_vectorizer.get_feature_names()
from sklearn.metrics.pairwise import cosine_distances
dist = cosine_distances(tfidf_matrix)
from sklearn.cluster import KMeans
num_clusters = 12 # Количество разделов уголовного кодекса
%time articles['km_cluster'] = KMeans(n_clusters=num_clusters).fit(tfidf_matrix).labels_.tolist()
from sklearn.cluster import DBSCAN
%time articles['db_cluster'] = DBSCAN(metric='precomputed').fit_predict(tfidf_matrix).tolist()
from sklearn.cluster import AgglomerativeClustering
%time articles['agg_cluster'] = AgglomerativeClustering(n_clusters=12).fit(tfidf_matrix.toarray()).labels_.tolist()
from sklearn.cluster import SpectralClustering
# Spectral clustering of the TF-IDF matrix with library-default parameters;
# per-article labels are stored for the plots below.
%time articles['spec_cluster'] = SpectralClustering().fit(tfidf_matrix).labels_.tolist()
from sklearn.manifold import MDS
mds = MDS(n_components=2, dissimilarity="precomputed", random_state=42)
%time pos2 = mds.fit_transform(dist)
mds = MDS(n_components=3, dissimilarity='precomputed', random_state=42)
%time pos3 = mds.fit_transform(dist)
articles['2d_pos'] = pos2.tolist()
articles['3d_pos'] = pos3.tolist()
def textfunc(row):
    """Hover label: KMeans cluster, article number and article name."""
    parts = ['Кластер: ' + str(row['km_cluster']),
             'Номер статьи: ' + row['number'],
             row['name']]
    return '<br>'.join(parts)
# 2-D MDS scatter coloured by KMeans cluster.
# go.Data and go.Marker are deprecated plotly wrappers; a plain list and a
# dict are accepted by every plotly version.
data = [
    go.Scatter(x=articles['2d_pos'].map(lambda p: p[0]),
               y=articles['2d_pos'].map(lambda p: p[1]),
               mode='markers',
               marker=dict(size=8,
                           color=articles['km_cluster'].astype(float),
                           colorscale='Jet'),
               text=articles.apply(textfunc, axis=1),
               showlegend=False,
               hoverinfo='text'),
]
figure = go.Figure(data=data, layout=go.Layout(
    title='УК РФ TF-IDF MDS KMeans(n_clusters=12)'))
plotly.offline.iplot(figure)
def textfunc(row):
    """Hover label: spectral cluster, article number and article name."""
    parts = ['Кластер: ' + str(row['spec_cluster']),
             'Номер статьи: ' + row['number'],
             row['name']]
    return '<br>'.join(parts)
# 2-D MDS scatter coloured by spectral-clustering label.
# Deprecated go.Data/go.Marker wrappers replaced with a list and a dict.
data = [
    go.Scatter(x=articles['2d_pos'].map(lambda p: p[0]),
               y=articles['2d_pos'].map(lambda p: p[1]),
               mode='markers',
               marker=dict(size=8,
                           color=articles['spec_cluster'].astype(float),
                           colorscale='Jet'),
               text=articles.apply(textfunc, axis=1),
               showlegend=False,
               hoverinfo='text'),
]
figure = go.Figure(data=data, layout=go.Layout(
    title='УК РФ TF-IDF MDS Spectral clustering'))
plotly.offline.iplot(figure)
# 3-D MDS scatter coloured by KMeans cluster.
# The module-level `textfunc` was last redefined to label the *spectral*
# cluster, which contradicted this figure's KMeans colouring and title; a
# local label function keeps the hover text consistent.
def km_textfunc(row):
    return ('Кластер: ' + str(row['km_cluster']) +
            '<br>Номер статьи: ' + row['number'] + '<br>' + row['name'])


data = [go.Scatter3d(x=articles['3d_pos'].map(lambda p: p[0]),
                     y=articles['3d_pos'].map(lambda p: p[1]),
                     z=articles['3d_pos'].map(lambda p: p[2]),
                     mode='markers',
                     marker=dict(size=3,
                                 color=articles['km_cluster'],
                                 colorscale='Jet'),
                     text=articles.apply(km_textfunc, axis=1),
                     hoverinfo='text')]
figure = go.Figure(data=data, layout=go.Layout(
    title='УК РФ TF-IDF MDS KMeans(n_clusters=12)'))
plotly.offline.iplot(figure)
def sectiontextfunc(row):
    """Hover label: article number, article name, and its code section."""
    parts = ['Номер статьи: ' + row['number'],
             row['name'],
             'Раздел ' + row['section'] + ' ' + row['section_name']]
    return '<br>' + '<br>'.join(parts)
# 2-D MDS scatter coloured by the article's actual code section (ground
# truth to compare the clusterings against).
# Deprecated go.Data/go.Marker wrappers replaced with a list and a dict.
data = [
    go.Scatter(x=articles['2d_pos'].map(lambda p: p[0]),
               y=articles['2d_pos'].map(lambda p: p[1]),
               mode='markers',
               marker=dict(size=8,
                           color=articles['section'].astype('category').cat.codes,
                           colorscale='Jet'),
               text=articles.apply(sectiontextfunc, axis=1),
               showlegend=False,
               hoverinfo='text'),
]
figure = go.Figure(data=data, layout=go.Layout(
    title='УК РФ TF-IDF MDS Разделы кодекса'))
plotly.offline.iplot(figure)
# NOTE(review): identical redefinition of sectiontextfunc above; kept so the
# notebook cells stay independently re-runnable.
def sectiontextfunc(row):
    """Hover label: article number, article name, and its code section."""
    parts = ['Номер статьи: ' + row['number'],
             row['name'],
             'Раздел ' + row['section'] + ' ' + row['section_name']]
    return '<br>' + '<br>'.join(parts)
# 3-D MDS scatter coloured by code section.
# Deprecated go.Data/go.Marker wrappers replaced with a list and a dict.
data = [go.Scatter3d(x=articles['3d_pos'].map(lambda p: p[0]),
                     y=articles['3d_pos'].map(lambda p: p[1]),
                     z=articles['3d_pos'].map(lambda p: p[2]),
                     mode='markers',
                     marker=dict(size=3,
                                 color=articles['section'].astype('category').cat.codes,
                                 colorscale='Jet'),
                     text=articles.apply(sectiontextfunc, axis=1),
                     hoverinfo='text')]
figure = go.Figure(data=data, layout=go.Layout(
    title='УК РФ TF-IDF Разделы кодекса'))
plotly.offline.iplot(figure)
from sklearn.manifold import TSNE
%time articles['2d_tsne'] = TSNE(perplexity=5, metric='precomputed').fit_transform(dist).tolist()
%time articles['3d_tsne'] = TSNE(n_components=3,perplexity=5, metric='precomputed').fit_transform(dist).tolist()
def textfunc(row):
    """Hover label: agglomerative cluster, article number and name."""
    parts = ['Кластер: ' + str(row['agg_cluster']),
             'Номер статьи: ' + row['number'],
             row['name']]
    return '<br>'.join(parts)
# 2-D t-SNE scatter coloured by agglomerative cluster.
# Deprecated go.Data/go.Marker wrappers replaced with a list and a dict.
data = [go.Scatter(x=articles['2d_tsne'].map(lambda p: p[0]),
                   y=articles['2d_tsne'].map(lambda p: p[1]),
                   mode='markers',
                   marker=dict(size=10,
                               color=articles['agg_cluster'],
                               colorscale='Jet'),
                   text=articles.apply(textfunc, axis=1),
                   hoverinfo='text')]
layout = go.Layout(title='УК РФ TF-IDF t-SNE(perplexity=5) Agglomerative Clustering')
figure = go.Figure(data=data, layout=layout)
plotly.offline.iplot(figure)
# NOTE(review): identical redefinition of the agglomerative-cluster label
# function above; kept so the notebook cells stay independently re-runnable.
def textfunc(row):
    """Hover label: agglomerative cluster, article number and name."""
    parts = ['Кластер: ' + str(row['agg_cluster']),
             'Номер статьи: ' + row['number'],
             row['name']]
    return '<br>'.join(parts)
# 3-D t-SNE scatter coloured by agglomerative cluster.
# Deprecated go.Data/go.Marker wrappers replaced with a list and a dict.
data = [go.Scatter3d(x=articles['3d_tsne'].map(lambda p: p[0]),
                     y=articles['3d_tsne'].map(lambda p: p[1]),
                     z=articles['3d_tsne'].map(lambda p: p[2]),
                     mode='markers',
                     marker=dict(size=3,
                                 color=articles['agg_cluster'],
                                 colorscale='Jet'),
                     text=articles.apply(textfunc, axis=1),
                     hoverinfo='text')]
figure = go.Figure(data=data, layout=go.Layout(
    title='УК РФ TF-IDF t-SNE(perplexity=5) Agglomerative clustering'))
plotly.offline.iplot(figure)
# 2-D t-SNE scatter coloured by code section (ground truth).
# Deprecated go.Data/go.Marker wrappers replaced with a list and a dict.
data = [go.Scatter(x=articles['2d_tsne'].map(lambda p: p[0]),
                   y=articles['2d_tsne'].map(lambda p: p[1]),
                   mode='markers',
                   marker=dict(size=10,
                               color=articles['section'].astype('category').cat.codes,
                               colorscale='Jet'),
                   text=articles.apply(sectiontextfunc, axis=1),
                   hoverinfo='text')]
layout = go.Layout(title='УК РФ TF-IDF t-SNE(perplexity=5) Разделы кодекса')
figure = go.Figure(data=data, layout=layout)
plotly.offline.iplot(figure)
# 3-D t-SNE scatter coloured by code section (ground truth).
# Deprecated go.Data/go.Marker wrappers replaced with a list and a dict.
data = [go.Scatter3d(x=articles['3d_tsne'].map(lambda p: p[0]),
                     y=articles['3d_tsne'].map(lambda p: p[1]),
                     z=articles['3d_tsne'].map(lambda p: p[2]),
                     mode='markers',
                     marker=dict(size=3,
                                 color=articles['section'].astype('category').cat.codes,
                                 colorscale='Jet'),
                     text=articles.apply(sectiontextfunc, axis=1),
                     hoverinfo='text')]
figure = go.Figure(data=data, layout=go.Layout(
    title='УК РФ TF-IDF t-SNE(perplexity=5) Разделы кодекса'))
plotly.offline.iplot(figure)
Возможно, стоило удалить все имена собственные из текстов статей. Однако стоит заметить, что, кроме «Российская Федерация», формулировок с именами собственными natasha не находит:
# from natasha import LocationExtractor
# extractor = LocationExtractor()
# for text in articles['body']:
# matches = extractor(text)
# for match in matches:
# print(match.span, match.fact)
import string


def strip_proppers(text):
    """Drop capitalised tokens (likely proper nouns) and re-join the rest.

    Only lower-case word tokens survive. Punctuation and clitics starting
    with an apostrophe are glued to the preceding token without a space.
    """
    kept = [token
            for sentence in nltk.sent_tokenize(text)
            for token in nltk.word_tokenize(sentence)
            if token.islower()]
    pieces = []
    for token in kept:
        if token.startswith("'") or token in string.punctuation:
            pieces.append(token)
        else:
            pieces.append(' ' + token)
    return ''.join(pieces).strip()
from nltk.tag import pos_tag


def strip_proppers_POS(text):
    """Return the tokens of *text* with proper nouns (NNP/NNPS) removed.

    NOTE(review): nltk's default pos_tag model targets English, so NNP tags
    are unlikely to be reliable on Russian text — confirm before relying on
    this; the helper appears unused in this file.
    """
    tagged = pos_tag(text.split())  # NLTK part-of-speech tagger
    return [word for word, tag in tagged if tag not in ('NNP', 'NNPS')]
from gensim import corpora, models, similarities
#remove proper names
%time preprocess = [strip_proppers(doc) for doc in articles['body']]
#tokenize
%time tokenized_text = [tokenize_and_stem(text) for text in preprocess]
#remove stop words
%time texts = [[word for word in text if word not in nltk.corpus.stopwords.words('russian')] for text in tokenized_text]
dictionary = corpora.Dictionary(texts)
dictionary.filter_extremes(no_below=1, no_above=0.8)
corpus = [dictionary.doc2bow(text) for text in texts]
%time lda = models.LdaModel(corpus, num_topics=5, id2word=dictionary, update_every=5, chunksize=10000, passes=100)
lda.show_topics(formatted=True, num_words=5)